# Training-loop settings.
train:
  seq_length: 2032
  epochs: 10000
  total_steps: 10000
  batch_size: 18

  # Checkpointing and evaluation intervals.
  checkpoint_interval: 1000
  eval_interval: 250

  # Pipeline and trainer are referenced by name.
  # NOTE(review): presumably resolved by the training code - confirm.
  pipeline: "CustomPromptPipeline"
  trainer: "CustomPPOTrainer"

  # Experiment-tracking destination.
  tracker: "wandb"
  project_name: "rlhf"
  entity_name: "open-assistant"

# Model settings.
model:
  # No path is set in this file; the value is an explicit null.
  # NOTE(review): presumably supplied by the launching code before training
  # starts - confirm.
  model_path: null
  model_arch_type: causal

# Tokenizer settings.
tokenizer:
  # No path is set in this file; the value is an explicit null.
  # NOTE(review): presumably supplied by the launching code before training
  # starts - confirm.
  tokenizer_path: null
  # AKo: changed this to "left", otherwise it sees only the beginning of
  # conversations.
  truncation_side: "left"
  padding_side: "left"  # use with rotary positional embeddings

# Optimizer selection and its keyword arguments.
optimizer:
  name: "adamw"
  # NOTE(review): presumably passed through to the optimizer constructor -
  # confirm.
  kwargs:
    # Keep the decimal point and the signed exponent in these literals: YAML 1.1
    # loaders such as PyYAML read a bare `1e-6` as a string, not a float.
    lr: 1.0e-6
    betas: [0.9, 0.95]
    eps: 1.0e-8
    weight_decay: 1.0e-6

# Learning-rate scheduler selection and its keyword arguments.
scheduler:
  name: "cosine_annealing"
  kwargs:
    # T_max mirrors train.total_steps (10000); keep the two in sync.
    # Alternative floor, currently disabled: eta_min: 1.0e-4
    # NOTE(review): eta_min below equals optimizer.kwargs.lr (1.0e-6). If this
    # is a standard cosine-annealing schedule, the learning rate would stay
    # flat at that value - confirm this is intended.
    T_max: 10000
    eta_min: 1.0e-6

# PPO method hyperparameters.
method:
  name: "ppoconfig"

  # Rollout collection and optimisation passes.
  num_rollouts: 1024
  chunk_size: 4
  ppo_epochs: 4

  # KL coefficient settings.
  # NOTE(review): `target` and `horizon` presumably parameterise an adaptive
  # KL controller - confirm against the trainer.
  init_kl_coef: 0.1
  target: 6
  horizon: 10000

  # Advantage estimation and clipping.
  gamma: 1
  lam: 0.95
  cliprange: 0.4
  cliprange_value: 0.4
  vf_coef: 1

  # Reward post-processing: scaling is disabled and no reference statistics
  # are set in this file.
  scale_reward: false
  ref_mean: null
  ref_std: null
  cliprange_reward: 4

  # Generation settings used when sampling.
  # NOTE(review): presumably forwarded to the model's generate call - confirm.
  gen_kwargs:
    max_new_tokens: 512
    top_k: 0
    top_p: 0.7
    do_sample: true
    temperature: 1.0
